#!/usr/bin/env python
#coding:utf-8
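# Download the HTML documents linked from a given page and save them into a local directory.
# ver 1: adapted from code found online; added saving the downloaded files into a directory.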
import os
import urllib
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    '''HTML解析类
    '''
    def reset(self):
        '''初始化类
        '''
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        '''解析html中的href链接
        '''
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
# Helpers shared by the index download and the per-link downloads below.
def _fetch(address):
    '''Return the raw body served at *address*.

    The connection is closed even if reading raises.
    '''
    sock = urllib.urlopen(address)
    try:
        return sock.read()
    finally:
        sock.close()


def _save(path, data):
    '''Write *data* to *path*, replacing any existing file.

    The file is opened in binary mode so the downloaded bytes are stored
    unchanged (text mode would rewrite line endings on Windows), and it
    is always closed so the content is flushed to disk.
    '''
    out = open(path, 'wb')
    try:
        out.write(data)
    finally:
        out.close()


# Read the index page and save a local copy of it.
url = r'http://www.sinc.sunysb.edu/Clubs/buddhism/JinGangJingShuoShenMo/'
htmlSource = _fetch(url)
_save('jgj.html', htmlSource)

# Download every document linked from the index page into the local
# "tt" directory. Each href is appended to the index URL as-is, so only
# plain relative links resolve to a valid address and file name.
mypath = url
parser = URLLister()
parser.feed(htmlSource)
# close() makes the parser process any input it is still buffering.
parser.close()

# Create the output directory once, and only when there is something to save.
if parser.urls and not os.path.isdir('tt'):
    os.mkdir('tt')

for link in parser.urls:
    myurl = mypath + link
    print('get:' + myurl)
    html2 = _fetch(myurl)

    # Save the document under the same name as its link.
    print('save as:' + link)
    _save('tt/' + link, html2)
